import plotly.express as px
import plotly.graph_objects as go
from dash import dcc
from dash import html
from dash import Input, Output
import dash
import jupyter_dash
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from jupyter_dash import JupyterDash
import gc, os
import requests
from holoviews import opts, dim
import holoviews as hv
hv.extension('bokeh')
hv.output(size=200)
df=pd.read_csv('data_raw-001.csv')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 13774715 entries, 0 to 13774714 Data columns (total 27 columns): # Column Dtype --- ------ ----- 0 trip_id int64 1 usertype object 2 gender object 3 starttime object 4 stoptime object 5 tripduration int64 6 from_station_id int64 7 from_station_name object 8 latitude_start float64 9 longitude_start float64 10 dpcapacity_start float64 11 to_station_id int64 12 to_station_name object 13 latitude_end float64 14 longitude_end float64 15 dpcapacity_end float64 16 temperature float64 17 windchill float64 18 dewpoint float64 19 humidity float64 20 pressure float64 21 visibility float64 22 wind_speed float64 23 precipitation float64 24 events object 25 rain int64 26 conditions object dtypes: float64(14), int64(5), object(8) memory usage: 2.8+ GB
(df.isnull().sum()/len(df)*100) #checking null values
trip_id 0.000000 usertype 0.000000 gender 27.274118 starttime 0.000000 stoptime 0.000000 tripduration 0.000000 from_station_id 0.000000 from_station_name 0.000000 latitude_start 0.008370 longitude_start 0.008370 dpcapacity_start 0.008370 to_station_id 0.000000 to_station_name 0.000000 latitude_end 0.008566 longitude_end 0.008566 dpcapacity_end 0.008566 temperature 0.000000 windchill 0.000000 dewpoint 0.000000 humidity 0.006679 pressure 0.000000 visibility 0.000000 wind_speed 0.000000 precipitation 0.000000 events 0.000000 rain 0.000000 conditions 0.000000 dtype: float64
df.dropna(axis=0,inplace=True)
df['starttime']=pd.to_datetime(df['starttime'])
df['stoptime']=pd.to_datetime(df['stoptime'])
df=df[(df.usertype == 'Subscriber')]
df['Hour']=pd.DatetimeIndex(df.starttime).hour
df['Year'] = pd.DatetimeIndex(df.starttime).year
df['Month'] = pd.DatetimeIndex(df.starttime).month
df['Quarter']=pd.DatetimeIndex(df.starttime).to_period('Q')
df['WeekDay'] = pd.DatetimeIndex(df.starttime).weekday
df['Day'] = pd.DatetimeIndex(df.starttime).day
df['Tripduration(Mins)']=df.tripduration.apply(lambda x: round(x/60)) #converting mins to secs
df.drop(['conditions','precipitation','dpcapacity_start','dpcapacity_end','rain','windchill'],axis=1,inplace=True)
df.columns
Index(['trip_id', 'usertype', 'gender', 'starttime', 'stoptime',
'tripduration', 'from_station_id', 'from_station_name',
'latitude_start', 'longitude_start', 'to_station_id', 'to_station_name',
'latitude_end', 'longitude_end', 'temperature', 'dewpoint', 'humidity',
'pressure', 'visibility', 'wind_speed', 'events', 'Hour', 'Year',
'Month', 'Quarter', 'WeekDay', 'Day', 'Tripduration(Mins)'],
dtype='object')
df['Month']=df['Month'].map({1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'})
df['WeekDay']=df['WeekDay'].map({0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'})
def Seasons(s):
if(s == 'Dec' or s == 'Jan' or s == 'Feb'):
return 'Winter'
if(s == 'Mar' or s == 'Apr' or s == 'May'):
return 'Spring'
if(s == 'Jun' or s == 'Jul' or s == 'Aug'):
return 'Summer'
if(s == 'Sep' or s == 'Oct' or s == 'Nov'):
return 'Fall'
df['Seasons']=df.Month.apply(Seasons)
#df.to_csv('df_cleaned.csv')
#df_cleaned=pd.read_csv('df_cleaned.csv')
#df_cleaned.head(2)
| Unnamed: 0 | trip_id | usertype | gender | starttime | stoptime | tripduration | from_station_id | from_station_name | latitude_start | ... | wind_speed | events | Hour | Year | Month | Quarter | WeekDay | Day | Tripduration(Mins) | Seasons | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 4095 | Subscriber | Male | 2013-06-27 12:06:00 | 2013-06-27 12:11:00 | 301 | 85 | Michigan Ave & Oak St | 41.90096 | ... | 13.8 | mostlycloudy | 12 | 2013 | Jun | 2013Q2 | Thu | 27 | 5 | Summer |
| 1 | 2 | 4192 | Subscriber | Male | 2013-06-27 12:15:00 | 2013-06-27 12:16:00 | 60 | 28 | Larrabee St & Menomonee St | 41.91468 | ... | 13.8 | mostlycloudy | 12 | 2013 | Jun | 2013Q2 | Thu | 27 | 1 | Summer |
2 rows × 30 columns
#df_cleaned.drop('Unnamed: 0',axis=1,inplace=True)
Outliers_tripduration_mins = df_cleaned['Tripduration(Mins)']
Outliers_tripduration_mins_Q1 = Outliers_tripduration_mins.quantile(0.25) # 1st Quartile
Outliers_tripduration_mins_Q3 = Outliers_tripduration_mins.quantile(0.75) # 3rd Quartile
Outliers_tripduration_mins_IQR= Outliers_tripduration_mins_Q3-Outliers_tripduration_mins_Q1 # Inter Quartile Range
Outliers_tripduration_mins_Lowerrange=Outliers_tripduration_mins_Q1-(1.5 * Outliers_tripduration_mins_IQR)
Outliers_tripduration_mins_Upperrange=Outliers_tripduration_mins_Q3+(1.5 * Outliers_tripduration_mins_IQR)
print(Outliers_tripduration_mins_Q1, " -> 1st Quartile")
print(Outliers_tripduration_mins_Q3, " -> 3rd Quartile")
print(Outliers_tripduration_mins_IQR, " -> Inter Quartile Range")
print(Outliers_tripduration_mins_Lowerrange, " -> Outliers below the lower Range")
print(Outliers_tripduration_mins_Upperrange, " -> Outliers Above the Upper Range")
6.0 -> 1st Quartile 15.0 -> 3rd Quartile 9.0 -> Inter Quartile Range -7.5 -> Outliers below the lower Range 28.5 -> Outliers Above the Upper Range
Outliers_tripduration_mins_outliers=df_cleaned[(Outliers_tripduration_mins < Outliers_tripduration_mins_Lowerrange) |
(Outliers_tripduration_mins > Outliers_tripduration_mins_Upperrange)]
print((len(Outliers_tripduration_mins_outliers)/len(df_cleaned['Tripduration(Mins)']))*100,"%")
<ipython-input-42-2b024399c4ef>:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
2.729261949511351 %
TripDuration_Analysis=df_cleaned[(df_cleaned['Tripduration(Mins)'] >= 2) & (df_cleaned['Tripduration(Mins)'] <= 28.5)]
#TripDuration_Analysis=df_cleaned
TripDuration_Analysis.head(2)
| Unnamed: 0 | trip_id | usertype | gender | starttime | stoptime | tripduration | from_station_id | from_station_name | latitude_start | ... | wind_speed | events | Hour | Year | Month | Quarter | WeekDay | Day | Tripduration(Mins) | Seasons | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 4095 | Subscriber | Male | 2013-06-27 12:06:00 | 2013-06-27 12:11:00 | 301 | 85 | Michigan Ave & Oak St | 41.90096 | ... | 13.8 | mostlycloudy | 12 | 2013 | Jun | 2013Q2 | Thu | 27 | 5 | Summer |
| 4 | 7 | 4289 | Subscriber | Female | 2013-06-27 14:57:00 | 2013-06-27 15:05:00 | 487 | 32 | Racine Ave & Congress Pkwy | 41.87464 | ... | 17.3 | partlycloudy | 14 | 2013 | Jun | 2013Q2 | Thu | 27 | 8 | Summer |
2 rows × 30 columns
#TripDuration_Analysis['Hour_Descrete']=TripDuration_Analysis.hour.apply(update_hour)
df_TripComparison=TripDuration_Analysis[['gender','Quarter','Seasons','Month','WeekDay','Hour','trip_id','Tripduration(Mins)']]
df_TripComparison=df_TripComparison.groupby(['gender','Month','Seasons','WeekDay'],
as_index=False).agg({'Tripduration(Mins)': 'mean'})
fig_tripduration = px.sunburst(df_TripComparison, path=['gender','Month','Seasons','WeekDay'],
values='Tripduration(Mins)',
color='Tripduration(Mins)',
color_continuous_scale='blues',
maxdepth=5)
fig_tripduration.add_trace(go.Sunburst(insidetextorientation='radial'))
fig_tripduration=fig_tripduration.update_layout(margin=dict(t=10, b=10, r=100, l=10), coloraxis_colorbar_x=0.8,autosize=True,
coloraxis_colorbar=dict(
title="Trip Duration (In Mins)"
))
fig_tripduration.show()
df_cleaned=df_cleaned[df_cleaned.events != 'unknown']
app = JupyterDash(__name__)
colors={
'background':'white',
'text':'#7FDBFF'
}
app.layout = html.Div([
html.H1("Sunburst Chart", style={'color':'#193498'}),
html.H3("Overview of the Monthly, Seasonal & Weekly Analysis of the Trip Durations of Male & Female Cyclists",
style={'color':'black','marginBottom':'30px'}),
dcc.Interval(
id='interval-component',
#interval=1,
#n_intervals=0
),
dcc.Graph(id='graph'),
])
@app.callback(
Output('graph', 'figure'),
[Input("interval-component"," ")]
)
def update_figure(colorscale):
'''return px.scatter(
df, x="total_bill", y="tip", color="size",
color_continuous_scale=colorscale,
render_mode="webgl", title="Tips"
)'''
return fig_tripduration
app.run_server(mode='external')
Dash app running on http://127.0.0.1:8050/
Trips=df_cleaned[['from_station_name','to_station_name','Tripduration(Mins)']].value_counts()
Trips.head(10)
trip
from_station_name to_station_name Tripduration(Mins) Southport Ave & Wellington Ave Sheffield Ave & Wellington Ave 4 4374 Sheffield Ave & Wellington Ave Southport Ave & Wellington Ave 4 4070 Ashland Ave & Division St Damen Ave & Division St 3 3503 Broadway & Belmont Ave Wilton Ave & Belmont Ave 3 3245 LaSalle St & Jackson Blvd Canal St & Madison St 5 3167 Southport Ave & Wrightwood Ave Sheffield Ave & Fullerton Ave 5 3108 Sheridan Rd & Irving Park Rd Pine Grove Ave & Irving Park Rd 3 3017 Sheffield Ave & Fullerton Ave Southport Ave & Wrightwood Ave 5 2724 Wilton Ave & Diversey Pkwy Hampden Ct & Diversey Pkwy 4 2687 Calumet Ave & 33rd St State St & 33rd St 3 2552 dtype: int64
Trip_plot=pd.DataFrame.from_records(list(Trips.index),columns=['Source Station','Destination Station','Tripduration(Mins)'])
#Trip_plot['Trips']=Trips.values
Trip_plot=Trip_plot.sort_values(by='Tripduration(Mins)',ascending=False)
Trip_plot=Trip_plot.head(10)
Trip_plot
| Source Station | Destination Station | Tripduration(Mins) | |
|---|---|---|---|
| 1015684 | Damen Ave & Division St | Western Ave & Walton St | 1437 |
| 722782 | South Shore Dr & 67th St | Jeffery Blvd & 71st St | 1437 |
| 686012 | Sheffield Ave & Kingsbury St | Dearborn St & Erie St | 1436 |
| 959152 | Harper Ave & 59th St | Ellis Ave & 60th St | 1435 |
| 1058209 | Ellis Ave & 58th St | Ellis Ave & 55th St | 1435 |
| 1099826 | Damen Ave & Sunnyside Ave | Leavitt St & Lawrence Ave | 1435 |
| 658172 | State St & Kinzie St | Albany Ave & Bloomingdale Ave | 1434 |
| 708256 | Southport Ave & Irving Park Rd | McClurg Ct & Illinois St | 1433 |
| 960784 | Hampden Ct & Diversey Pkwy | Loomis St & Jackson Blvd | 1432 |
| 825505 | Canal St & Adams St | Ogden Ave & Congress Pkwy | 1432 |
Chord_Chart=hv.Chord(Trip_plot)
Chord_Chart=Chord_Chart.opts(node_color='index', edge_color='Source Station',label_index='index',cmap='Category10',
edge_cmap='Category10', show_legend=False, width=350, height=350)
Chord_Chart